In [1]:
%%capture
%run "4 - Linear Algebra.ipynb"
%run "5 - Statistics.ipynb"
%run "6 - Probability.ipynb"
%run "8 - Gradient Descent.ipynb"
In [2]:
def bucketize(point, bucket_size):
    """floor the point to the next lower multiple of bucket_size"""
    return bucket_size * math.floor(point / bucket_size)

def make_histogram(points, bucket_size):
    """buckets the points and counts how many fall in each bucket"""
    return Counter(bucketize(point, bucket_size) for point in points)

def plot_histogram(points, bucket_size, title=''):
    histogram = make_histogram(points, bucket_size)
    plt.bar(list(histogram.keys()), histogram.values(),
            width=bucket_size, edgecolor='white')
    plt.title(title)
    plt.show()
In [3]:
import random
random.seed(0)
# uniform #'s will be between -100 and 100
uniform = [200 * random.random() - 100 for _ in range(10000)]
# normal #'s will be from the normal distribution with mean of 0 and standard deviation of 57
normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]
In [4]:
plot_histogram(uniform, 10, 'Uniform Histogram')
In [5]:
plot_histogram(normal, 10, 'Normal Histogram')
In [6]:
def random_normal():
    """returns a random draw from a standard normal distribution"""
    return inverse_normal_cdf(random.random())
xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]
ys3 = [0 if x < 0 else 6 for x in xs]
In [7]:
plt.scatter(xs, ys1, marker='.', color='black', label='ys1');
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2');
plt.xlabel('xs');
plt.ylabel('ys');
plt.legend(loc=5);
plt.title('Very Different Joint Distributions');
In [8]:
correlation(xs, ys1), correlation(xs, ys2)
Out[8]:
In [9]:
def correlation_matrix(data):
    """returns a matrix where x_ij is the correlation between col i and col j"""
    _, num_cols = shape(data)

    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))

    return make_matrix(num_cols, num_cols, matrix_entry)
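As a quick sanity check (an illustrative cell, not part of the original analysis), the matrix should be symmetric and its diagonal entries should all be approximately 1, since every column is perfectly correlated with itself:
In [ ]:
# illustrative only: diagonal entries of a correlation matrix are ~1
check = correlation_matrix([[1, 2], [2, 1], [3, 4], [4, 3]])
all(abs(check[i][i] - 1) < 1e-9 for i in range(len(check)))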
In [10]:
data = [[x, y1, y2, y3] for (x, y1, y2, y3) in zip(xs, ys1, ys2, ys3)]
correlation_matrix(data)
Out[10]:
In [11]:
_, num_columns = shape(data)
fig, ax = plt.subplots(num_columns, num_columns)

for i in range(num_columns):
    for j in range(num_columns):
        # scatter column_j on the x-axis vs column_i on the y-axis
        if i != j:
            ax[i][j].scatter(get_column(data, j), get_column(data, i))
        # unless i == j, in which case show the series name
        else:
            ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                              xycoords='axes fraction',
                              ha="center", va="center")
        # then hide axis labels except on the left and bottom charts
        if i < num_columns - 1:
            ax[i][j].xaxis.set_visible(False)
        if j > 0:
            ax[i][j].yaxis.set_visible(False)

# fix the bottom-right and top-left axis limits, which are wrong because
# those charts only contain text
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
Out[11]:
In [12]:
def parse_row(input_row, parsers):
    """given a list of parsers (some of which may be None)
    apply the appropriate one to each element of the input_row"""
    return [parser(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]

def parse_rows_with(reader, parsers):
    """wrap a reader to apply the parsers to each of its rows"""
    for row in reader:
        yield parse_row(row, parsers)
In [13]:
def try_or_none(f):
    """wraps f to return None if f raises an exception
    assumes f takes only one input"""
    def f_or_none(x):
        try:
            return f(x)
        except:
            return None
    return f_or_none
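For example (a quick illustration that is not in the original notebook), wrapping float this way turns an unparseable value into None instead of a ValueError:
In [ ]:
# a bad value yields None rather than raising
try_or_none(float)("3.14"), try_or_none(float)("N/A")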
In [14]:
def parse_row(input_row, parsers):
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]
In [15]:
import dateutil.parser
import csv
data = []
with open("stocks.csv", "r") as f:
reader = csv.reader(f)
for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
data.append(line)
data
Out[15]:
In [16]:
max_aapl_price = max(row[2]
                     for row in data
                     if row[1] == "AAPL")
max_aapl_price
Out[16]:
In [17]:
import pandas as pd
pd.DataFrame({'Height (inches)': [63, 67, 70],
              'Height (centimeters)': [160, 170.2, 177.8],
              'Weight': [150, 160, 171]},
             index=['A', 'B', 'C'])
Out[17]:
If we use inches, B's closest neighbor is A:
In [18]:
a_to_b = distance([63, 150], [67, 160])
a_to_c = distance([63, 150], [70, 171])
b_to_c = distance([67, 160], [70, 171])
a_to_b, a_to_c, b_to_c
Out[18]:
But if we use centimeters, B's closest neighbor is C:
In [19]:
a_to_b = distance([160, 150], [170.2, 160])
a_to_c = distance([160, 150], [177.8, 171])
b_to_c = distance([170.2, 160], [177.8, 171])
a_to_b, a_to_c, b_to_c
Out[19]:
When results depend on the units of measure, as they do in the example above, rescaling can help. Rescaling transforms each dimension so that it has mean 0 and standard deviation 1, which effectively converts the data to a common unit of "standard deviations from the mean."
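In symbols, rescaling replaces each entry $x_{ij}$ with its z-score, where $\mu_j$ and $\sigma_j$ are the mean and standard deviation of column $j$ (columns with $\sigma_j = 0$ are left alone, as in the rescale function below):

$$x_{ij} \;\mapsto\; \frac{x_{ij} - \mu_j}{\sigma_j}$$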
In [20]:
def scale(data_matrix):
    """returns the means and standard deviations of each column"""
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j)) for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j)) for j in range(num_cols)]
    return means, stdevs
In [21]:
def rescale(data_matrix):
    """rescales the input data so that each column
    has mean 0 and standard deviation 1;
    leaves alone columns with no deviation"""
    means, stdevs = scale(data_matrix)

    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i][j] - means[j]) / stdevs[j]
        else:
            return data_matrix[i][j]

    num_rows, num_cols = shape(data_matrix)
    return make_matrix(num_rows, num_cols, rescaled)
Rescale our heights and weights so that the distances no longer depend on the choice of units:
In [22]:
hts_in = [[63], [67], [70]]
hts_cm = [[160], [170.2], [177.8]]
wts = [[150], [160], [171]]
hts_in_rescaled = rescale(hts_in)
hts_cm_rescaled = rescale(hts_cm)
wts_rescaled = rescale(wts)
hts_in_rescaled, hts_cm_rescaled, wts_rescaled
Out[22]:
Now when we calculate distances, the results are nearly identical whether we start from inches or from centimeters:
In [23]:
a_to_b_rescaled = distance([hts_in_rescaled[0][0], wts_rescaled[0][0]], [hts_in_rescaled[1][0], wts_rescaled[1][0]])
a_to_c_rescaled = distance([hts_in_rescaled[0][0], wts_rescaled[0][0]], [hts_in_rescaled[2][0], wts_rescaled[2][0]])
b_to_c_rescaled = distance([hts_in_rescaled[1][0], wts_rescaled[1][0]], [hts_in_rescaled[2][0], wts_rescaled[2][0]])
a_to_b_rescaled, a_to_c_rescaled, b_to_c_rescaled
Out[23]:
In [24]:
a_to_b_rescaled = distance([hts_cm_rescaled[0][0], wts_rescaled[0][0]], [hts_cm_rescaled[1][0], wts_rescaled[1][0]])
a_to_c_rescaled = distance([hts_cm_rescaled[0][0], wts_rescaled[0][0]], [hts_cm_rescaled[2][0], wts_rescaled[2][0]])
b_to_c_rescaled = distance([hts_cm_rescaled[1][0], wts_rescaled[1][0]], [hts_cm_rescaled[2][0], wts_rescaled[2][0]])
a_to_b_rescaled, a_to_c_rescaled, b_to_c_rescaled
Out[24]:
In [25]:
pca_data = [[7, 4, 3],
            [4, 1, 8],
            [6, 3, 5],
            [8, 6, 1],
            [8, 5, 7],
            [7, 2, 9],
            [5, 3, 3],
            [9, 5, 8],
            [7, 4, 5],
            [8, 2, 2]]
pca_xs = [random_normal() for _ in range(100)]
pca_ys1 = [x + random_normal() / 2 for x in pca_xs]
pca_ys2 = [-x + random_normal() / 2 for x in pca_xs]
pca_ys3 = [0 if x < 0 else 6 for x in pca_xs]
pca_data = [[x, y1, y2, y3] for (x, y1, y2, y3) in zip(pca_xs, pca_ys1, pca_ys2, pca_ys3)]
plt.scatter(pca_xs, pca_ys1);
plt.scatter(pca_xs, pca_ys2);
plt.scatter(pca_xs, pca_ys3);
In [26]:
def de_mean_matrix(A):
    """returns the result of subtracting from every value in A
    the mean of its column, so that every column has mean 0"""
    nr, nc = shape(A)
    column_means, _ = scale(A)
    return make_matrix(nr, nc, lambda i, j: A[i][j] - column_means[j])
pca_data_de_mean = de_mean_matrix(pca_data)
pca_data_de_mean[:5]
Out[26]:
In [27]:
plt.scatter([x[0] for x in pca_data_de_mean], [0 for _ in pca_data_de_mean]);
plt.scatter([x[1] for x in pca_data_de_mean], [0.1 for _ in pca_data_de_mean]);
plt.scatter([x[2] for x in pca_data_de_mean], [0.2 for _ in pca_data_de_mean]);
plt.scatter([x[3] for x in pca_data_de_mean], [0.3 for _ in pca_data_de_mean]);
frame = plt.gca()
frame.axes.yaxis.set_visible(False);
plt.ylim((-0.1, 1));
In [28]:
plt.scatter([x[0] for x in pca_data_de_mean], [x[1] for x in pca_data_de_mean]);
plt.scatter([x[0] for x in pca_data_de_mean], [x[2] for x in pca_data_de_mean]);
plt.scatter([x[0] for x in pca_data_de_mean], [x[3] for x in pca_data_de_mean]);
With the de-meaned matrix, we can figure out which direction captures the most variance in the data:
In [29]:
def direction(w):
    """rescales w to have magnitude 1"""
    mag = magnitude(w)
    return [w_i / mag for w_i in w]
pca_data_direction = [direction(w) for w in pca_data_de_mean]
pca_data_direction[:5]
Out[29]:
In [30]:
plt.scatter([x[0] for x in pca_data_direction], [x[1] for x in pca_data_direction]);
plt.scatter([x[0] for x in pca_data_direction], [x[2] for x in pca_data_direction]);
plt.scatter([x[0] for x in pca_data_direction], [x[3] for x in pca_data_direction]);
Now we can compute the variance of the data in a given direction w:
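Concretely, for rows $x_i$ (assumed de-meaned) and the unit vector $\hat{w} = w / \lVert w \rVert$, the code below computes the (unnormalized, i.e. not divided by $n$) variance in direction $w$ as

$$\sum_i (x_i \cdot \hat{w})^2.$$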
In [31]:
def directional_variance_i(x_i, w):
    """the variance of the row x_i in the direction determined by w"""
    return dot(x_i, direction(w)) ** 2

def directional_variance(X, w):
    """the variance of the data in the direction determined by w"""
    return sum(directional_variance_i(x_i, w) for x_i in X)

def directional_variance_gradient_i(x_i, w):
    """the contribution of row x_i to the gradient of
    the direction-w variance"""
    projection_length = dot(x_i, direction(w))
    return [2 * projection_length * x_ij for x_ij in x_i]

def directional_variance_gradient(X, w):
    return vector_sum(directional_variance_gradient_i(x_i, w) for x_i in X)
pca_data_variance = [directional_variance(pca_data_direction, w) for w in pca_data_direction]
pca_data_variance[:5]
Out[31]:
The first principal component is the direction that maximizes the directional variance above.
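Equivalently (restating the code below in a formula), the first principal component is

$$w_1 \;=\; \operatorname*{arg\,max}_{\lVert w \rVert = 1} \sum_i (x_i \cdot w)^2,$$

which the gradient-based code approximates by maximizing directional_variance over an unnormalized w and then normalizing the result with direction.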
In [32]:
def first_principal_component(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_batch(
        partial(directional_variance, X),           # is now a function of w
        partial(directional_variance_gradient, X),  # is now a function of w
        guess)
    return direction(unscaled_maximizer)
first_principal_component(pca_data)
Out[32]:
We can use stochastic gradient descent here:
In [33]:
def first_principal_component_sgd(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_stochastic(
        lambda x, _, w: directional_variance_i(x, w),
        lambda x, _, w: directional_variance_gradient_i(x, w),
        X,
        [None for _ in X],  # the fake "y"
        guess)
    return direction(unscaled_maximizer)
first_principal_component_sgd(pca_data)
Out[33]:
In [34]:
def project(v, w):
    """returns the projection of v onto direction w"""
    projection_length = dot(v, w)
    return scalar_multiply(projection_length, w)
In [35]:
def remove_projection_from_vector(v, w):
    """projects v onto w and subtracts the result from v"""
    return vector_subtract(v, project(v, w))

def remove_projection(X, w):
    """for each row of X, projects the row onto w
    and subtracts the result from the row"""
    return [remove_projection_from_vector(x_i, w) for x_i in X]
Removing the projection onto the first principal component effectively drops one dimension: our four-dimensional pca_data becomes effectively three-dimensional.
In [36]:
pca_data_fc_removed = remove_projection(pca_data, first_principal_component_sgd(pca_data))
pca_data_fc_removed[:5]
Out[36]:
In [37]:
plt.scatter([x[0] for x in pca_data_fc_removed], [x[1] for x in pca_data_fc_removed]);
plt.scatter([x[0] for x in pca_data_fc_removed], [x[2] for x in pca_data_fc_removed]);
plt.scatter([x[0] for x in pca_data_fc_removed], [x[3] for x in pca_data_fc_removed]);
For higher-dimensional data sets, iterate to find as many components as you like:
In [38]:
def principal_component_analysis(X, num_components):
    components = []
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)
    return components
Then transform the data into a lower-dimensional space:
In [39]:
def transform_vector(v, components):
    return [dot(v, w) for w in components]

def transform(X, components):
    return [transform_vector(x_i, components) for x_i in X]
In [40]:
pca_components = principal_component_analysis(pca_data, 3)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]
Out[40]:
In [41]:
plt.scatter([x[0] for x in pca_transformed], [x[1] for x in pca_transformed]);
plt.scatter([x[0] for x in pca_transformed], [x[2] for x in pca_transformed]);
plt.title('3-dimensions n=' + str(len(pca_transformed)));
In [42]:
pca_components = principal_component_analysis(pca_data, 2)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]
Out[42]:
In [43]:
plt.scatter([x[0] for x in pca_transformed], [x[1] for x in pca_transformed]);
plt.title('2-dimensions n=' + str(len(pca_transformed)));
In [44]:
pca_components = principal_component_analysis(pca_data, 1)
pca_transformed = transform(pca_data, pca_components)
pca_transformed[:5]
Out[44]:
In [45]:
plt.scatter([x[0] for x in pca_transformed], [0 for _ in pca_transformed]);
plt.title('1-dimension n=' + str(len(pca_transformed)));
Dimensionality reduction helps with analysis in two ways: it strips out low-variance dimensions that are often mostly noise, and the lower-dimensional data that remains is easier to model and to visualize. One downside, however, is that it can make models harder to interpret, because the original variables are collapsed into combinations.
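A rough way to decide how many components to keep (an illustrative sketch built from the functions above, not from the original text) is to look at the fraction of the total de-meaned sum of squares that the retained components capture:
In [ ]:
def explained_fraction(X, components):
    """fraction of the total (de-meaned) sum of squares captured by the
    given components; assumes the components are (nearly) orthogonal,
    which holds for the iterative procedure above"""
    X = de_mean_matrix(X)
    total = sum(dot(x_i, x_i) for x_i in X)
    captured = sum(directional_variance(X, w) for w in components)
    return captured / total

# e.g. explained_fraction(pca_data, pca_components)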